First we calculate a correlation matrix for the tags.
practices_cor <- tags %>%
select(starts_with("practices")) %>%
cor
practices_jac <- tags %>%
select(starts_with("practices")) %>%
simil(method = "Jaccard", by_rows = FALSE)
plot_tag_cor <- function(cor_mat, title = "") {
ggcorrplot(cor_mat, hc.order = T, type = "upper") +
scale_fill_distiller(type = "div", limits = c(-1, 1), expand = c(0, 0)) +
labs(title = title,
fill = "Correlation") +
#scale_x_discrete(labels = label_tags) +
#scale_y_discrete(labels = label_tags) +
labs(x = "", y = "") +
#scale_fill_cc_gradient +
# theme_transcend_sparse +
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
panel.border = element_blank(),
axis.ticks = element_blank()
)
}cor_plot <- plot_tag_cor(practices_cor,
title = "Correlation heat map for all tags") +
theme(legend.position = c(.85, .25))
ggplotly(cor_plot)jac_plot <- plot_tag_cor(as.matrix(practices_jac),
title = "Jaccard similarity for all tags") +
theme(legend.position = c(.85, .25)) +
scale_fill_distiller(type = "div", limits = c(0, 1), expand = c(0, 0))
ggplotly(jac_plot)mat_to_df <- function(m) {
data.frame(row=rownames(m)[row(m)[upper.tri(m)]],
col=colnames(m)[col(m)[upper.tri(m)]],
corr=m[upper.tri(m)])
}
d_corr <- mat_to_df(practices_cor)
d_jacc <- mat_to_df(as.matrix(practices_jac))
jacc_corr <- d_jacc %>%
rename(jaccard = corr) %>%
left_join(d_corr, by = c("row", "col")) %>%
rename(correlation = corr) %>%
mutate(
jacc_rank = rank(-jaccard),
corr_rank = rank(-correlation),
rank_diff = jacc_rank - corr_rank
) %>%
arrange(desc(jaccard))
jacc_corr %>%
datatable(
caption = "Comparison of Jaccard Similarity and Correlation Coeffficients",
colnames = c("Jaccard (0,1)" = "jaccard", "Correlation (-1,1)" = "correlation")
) %>%
formatRound(digits = 2, columns = c("Jaccard (0,1)", "Correlation (-1,1)")) corr_hist <- ggplot(jacc_corr, aes(x = correlation)) +
geom_histogram(binwidth = 0.03 #, fill = transcend_cols["blue"]
) +
geom_vline(aes(xintercept = mean(correlation)),
#color = transcend_cols["red"],
size = 1) +
geom_text(aes(x = mean(correlation), y = Inf,
label = paste("Average:",round(mean(correlation), 2))),
hjust = -.1, check_overlap = TRUE, vjust = 1.1,
family = "Open Sans") +
# bar_y_scale_count +
scale_x_continuous(limits = c(-1, 1), expand = expansion(0, 0)) +
labs(title = "Distribution of pairwise tag correlations",
y = "Count of Tag Pairs",
x = "Correlation") +
theme(plot.margin = margin(t = 8, r = 12, b = 8, l = 8, unit = "pt")) +
theme_minimal()## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
jacc_hist <- ggplot(jacc_corr, aes(x = jaccard)) +
geom_histogram(binwidth = 0.03 #, fill = transcend_cols["teal"]
) +
geom_vline(aes(xintercept = mean(jaccard)),
#color = transcend_cols["red"],
size = 1) +
geom_text(aes(x = mean(jaccard), y = Inf,
label = paste("Average:", round(mean(jaccard), 2))),
hjust = -.1, check_overlap = TRUE, vjust = 1.1,
family = "Open Sans") +
# bar_y_scale_count +
scale_x_continuous(limits = c(0, 1), expand = expansion(0, 0)) +
labs(title = "Distribution of pairwise tag similarities",
y = "Count of Tag Pairs",
x = "Jaccard Similarity") +
theme(plot.margin = margin(t = 8, r = 12, b = 8, l = 8, unit = "pt")) +
theme_minimal()
corr_hist + jacc_hist## Warning: Removed 2 rows containing missing values (`geom_bar()`).
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
fa_cor <- fa.parallel(
practices_cor, fm = "pa", fa = "fa", n.obs = nrow(tags),
main = "Correlation scree plot"
)## Parallel analysis suggests that the number of factors = 11 and the number of components = NA
jac_mat <- as.matrix(practices_jac)
jac_mat[is.na(jac_mat)] = 1
fa_jac <- fa.parallel(jac_mat, fm = "pa", fa = "fa",
n.obs = nrow(tags),
main = "Jaccard scree plot"
)## Parallel analysis suggests that the number of factors = 3 and the number of components = NA
# n_i <- nrow(values_3) # The number of cases in our data
# n_p <- ncol(values_3) # The number of variables in our data
#
# set.seed(2) # To reproduce our randomly generated results.
#
# Eigs <- pca_3$values # The eigenvalues
# n_components <- length(Eigs) # number of components
#
# paral <- parallel(subject = n_i, # The number of cases in our data
# var = n_p, # The number of variables in our data
# rep = 1000,
# quantile = .95,
# model = "components")
#
# ParallelAna <- data.frame(Ncomponent = 1:n_components,
# Eigs,
# RandEigM = paral$eigen$mevpea,
# RandEig95= paral$eigen$qevpea)
#
# ParallelAna <- round(ParallelAna, 3)
# ParallelAna# exceeder <- ParallelAna[ParallelAna[, "RandEig95"] > ParallelAna[, "Eigs"], ][1,]
# exceederefa_2023 %>%
model_parameters(sort = TRUE, threshold = "max") %>%
write_tsv(here("output", "EFA 2023 Results Max.txt"), na = "") %>%
datatable(caption = "Max loadings") %>%
formatRound(digits = 2, columns = 2:8) efa_2023 %>%
model_parameters(sort = TRUE, threshold = 0.28) %>%
write_tsv(here("output", "EFA 2023 Results Threshold.txt"), na = "") %>%
datatable(caption = "Threshold loadings") %>%
formatRound(digits = 2, columns = 2:8) efa_2023 %>%
model_parameters(sort = TRUE) %>%
write_tsv(here("output", "EFA 2023 Results All.txt"), na = "") %>%
datatable(caption = "All loadings") %>%
formatRound(digits = 2, columns = 2:8) # Warning message included below, just in case:
# DataTables warning: table id=DataTables_Table_0 - Requested unknown parameter '7' for row 0, column 7. For more information about this error, please see http://datatables.net/tn/4